package win.devhu.testing.simple.jsoupdemo

import com.google.gson.Gson
import com.google.gson.JsonObject
import com.google.gson.JsonParser
import org.jsoup.Jsoup
import java.util.regex.Pattern
import org.jsoup.nodes.Document
import org.slf4j.Logger
import org.slf4j.LoggerFactory

/**
 * @Author: Devhu.win
 * @Description: Traverses Sina Vdisk (vdisk.weibo.com) share pages and collects the file download addresses.
 */

/**
 * Demo entry point: checks whether a sample share page is a single-file page
 * and prints the result.
 */
fun main(args: Array<String>) {
    // Share page of an EPUB file.
    val shareUrl = "http://vdisk.weibo.com/s/axKbwB-V65rrp"
    val crawler = VdiskDownUrl()

    val singleFile = crawler.isSingleFile(shareUrl)
    println(singleFile)

    // To crawl the whole share tree instead, enable the lines below:
//    crawler.traceChildPagesLinks(shareUrl)
//    println("完成，总共抓取到${crawler.totalLists.size}条数据。\n ${crawler.totalLists}")
}

/**
 * Crawler for Sina Vdisk share pages.
 *
 * Starting from a share (or user) page it walks every paginated listing, follows the
 * links found in `div.sort_name_detail`, and for each single-file page extracts the
 * JSON object passed to `fileDown.init(...)`. The results are stored in [totalLists]
 * as JSON-serialised [DownloadInfo] strings.
 *
 * Every method performs blocking HTTP requests through jsoup.
 */
class VdiskDownUrl {

    /** A link found on a listing page: its visible text and its href without the query string. */
    data class UrlInfo(val title: String, val url: String)

    /** Download details extracted from a single-file share page. */
    data class DownloadInfo(val title: String, val shareUrl: String, val downloadUrl: String, val size: String)

    /** Collected results; each entry is a [DownloadInfo] serialised to JSON. */
    var totalLists: MutableSet<String> = mutableSetOf()
    private val gson = Gson()

    private val logger: Logger = LoggerFactory.getLogger(VdiskDownUrl::class.java)

    /**
     * Recursively collects download information for [url] and every page reachable from it.
     *
     * Unavailable pages are skipped. Single-file pages contribute one entry to [totalLists];
     * any other page is treated as a listing whose links are followed in turn.
     *
     * @param url page address to start from
     */
    fun traceChildPagesLinks(url: String) {
        // The visited set lives for one top-level call only, so calling this method again
        // with the same URL re-crawls it, as before.
        crawl(url, mutableSetOf())
    }

    /**
     * Recursive worker for [traceChildPagesLinks].
     *
     * @param url page address to process
     * @param visited addresses already processed during this crawl; guards against link cycles
     *        and avoids fetching the same page repeatedly
     */
    private fun crawl(url: String, visited: MutableSet<String>) {
        if (!visited.add(url)) return
        if (isMissFile(url)) return

        if (isSingleFile(url)) {
            // A page without usable download data is logged inside downloadUrl and skipped.
            downloadUrl(url)?.let { totalLists.add(it) }
            return
        }

        // Gather the links of every listing page first, then descend into each distinct link once.
        val children = linkedSetOf<UrlInfo>()
        getAllPages(url).forEach { pageUrl -> children.addAll(getShareLists(pageUrl)) }
        children.forEach { child -> crawl(child.url, visited) }
    }

    /**
     * Reads the links listed on a share/listing page.
     *
     * @param shareUserUrl listing page address
     * @return the distinct links inside `div.sort_name_detail`, in document order
     */
    private fun getShareLists(shareUserUrl: String): Set<UrlInfo> {
        val doc = Jsoup.connect(shareUserUrl).ignoreContentType(true).get()
        return doc.select("div.sort_name_detail").select("a")
            // The query string is dropped so it cannot interfere with the paging
            // parameter appended later.
            .map { link -> UrlInfo(link.text(), link.attr("href").substringBefore("?")) }
            // An empty href cannot be fetched; Jsoup.connect would reject it.
            .filter { it.url.isNotBlank() }
            .toSet()
    }

    /**
     * Tells whether [url] is a single-file page rather than a listing of several files.
     *
     * A page counts as single-file when it contains the preview module (`#filePreviewWrap`)
     * or the "preview not supported" block (`.vd_browser_box.vd_browser_music`).
     *
     * @param url page address
     * @return `true` when either marker element is present
     */
    fun isSingleFile(url: String): Boolean {
        val doc = Jsoup.connect(url).ignoreContentType(true).get()
        val canNotPreview = doc.select("div .vd_browser_box.vd_browser_music")
        val hasPreview = isExistElementById(doc, "filePreviewWrap")
        logger.info("$hasPreview") // preview module present
        logger.info("$canNotPreview") // "preview not supported" block, if any
        // Presence of the block is what matters. Comparing its text with "" is also true
        // when the selector matches nothing, which misclassified listing pages.
        return hasPreview || canNotPreview.isNotEmpty()
    }

    /**
     * Tells whether the share behind [url] is unavailable (taken down or under review).
     *
     * @param url page address
     * @return `true` when the response status is not 200 or the page shows the
     *         "under review" notice
     */
    private fun isMissFile(url: String): Boolean {
        val response = Jsoup.connect(url)
            .ignoreContentType(true)
            .ignoreHttpErrors(true)
            .execute()
        if (response.statusCode() != 200) {
            logger.info("该页面被和谐了，跳过! 页面地址:${url}")
            return true
        }
        // Parse the response already received instead of requesting the page a second time.
        val notice = response.parse()
            .getElementsContainingOwnText("主人可能正在整理或文件夹正在被审核，一会儿再来看看吧~")
        return notice.text().isNotEmpty()
    }

    /**
     * Tells whether the page at [url] has pagination.
     *
     * @param url page address
     * @return `true` when the `vd_page_main` element contains text
     */
    private fun isExistPage(url: String): Boolean =
        hasPagination(Jsoup.connect(url).ignoreContentType(true).get())

    /** Pagination exists when the `vd_page_main` element of [doc] has non-empty text. */
    private fun hasPagination(doc: Document): Boolean =
        doc.getElementsByClass("vd_page_main").text().isNotEmpty()

    /**
     * Builds the address of every listing page belonging to [url].
     *
     * Share directories (`/s/...`) are paged with `?pn=`, everything else (e.g. user home
     * directories, `/u/...`) with `?page=`. Page 1 is always included.
     *
     * @param url page address
     * @return the page addresses in ascending page order
     */
    private fun getAllPages(url: String): MutableSet<String> {
        val baseUrl = url.substringBefore("?")
        // For http://host/s/xxx the fourth "/"-separated segment is the page kind.
        val prefix = if (baseUrl.split("/").getOrNull(3) == "s") "?pn=" else "?page="
        val pages: MutableSet<String> = mutableSetOf("$baseUrl${prefix}1")

        // One fetch serves both the "has pagination" check and the page-count lookup.
        val doc = Jsoup.connect(url).ignoreContentType(true).get()
        if (hasPagination(doc)) {
            val pagerLinks = doc.getElementsByClass("vd_page").select("a")
            // The second-to-last pager link carries the highest page number.
            val lastPage = pagerLinks.getOrNull(pagerLinks.size - 2)?.text()?.trim()?.toIntOrNull()
            if (lastPage == null) {
                logger.warn("Could not determine the page count, using the first page only: $url")
            } else {
                for (page in 1..lastPage) {
                    pages.add("$baseUrl$prefix$page")
                }
            }
        }
        return pages
    }

    /**
     * Checks whether an element with the given id exists.
     *
     * @param doc parsed document
     * @param elementId element id to look for
     * @return `true` when [doc] contains such an element
     */
    private fun isExistElementById(doc: Document, elementId: String): Boolean =
        doc.getElementById(elementId) != null

    /**
     * Extracts the download information from a single-file share page.
     *
     * @param shareUrl address of the single-file share page
     * @return the [DownloadInfo] serialised to JSON, or `null` when the page carries no
     *         usable `fileDown.init` data
     */
    private fun downloadUrl(shareUrl: String): String? {
        val body = Jsoup.connect(shareUrl).ignoreContentType(true).execute().body()

        // When the pattern occurs several times the last occurrence wins.
        val matcher = FILE_DOWN_INFO.matcher(body)
        var lastMatch: String? = null
        while (matcher.find()) {
            lastMatch = matcher.group(1)
        }
        val rawInfo = lastMatch ?: run {
            logger.warn("No fileDown.init data found, skipping: $shareUrl")
            return null
        }

        // Turn the escaped "\/" sequences into plain "/" before parsing.
        val unescaped = rawInfo.replace("\\/", "/")
        val jsonInfo = try {
            JsonParser().parse(unescaped) as? JsonObject
        } catch (e: com.google.gson.JsonParseException) {
            logger.warn("Malformed fileDown.init data, skipping: $shareUrl", e)
            null
        } ?: return null

        val title = jsonInfo.stringOrNull("name")
        val size = jsonInfo.stringOrNull("size")
        val fileUrl = jsonInfo.stringOrNull("url")
        if (title == null || size == null || fileUrl == null) {
            logger.warn("Incomplete fileDown.init data, skipping: $shareUrl")
            return null
        }

        logger.info("电子书信息：$title $size \n下载地址：$fileUrl")
        return gson.toJson(DownloadInfo(title, shareUrl, fileUrl, size))
    }

    /** Returns the primitive value stored under [key] as a string, or `null` when absent or not a primitive. */
    private fun JsonObject.stringOrNull(key: String): String? =
        get(key)?.takeIf { it.isJsonPrimitive }?.asString

    companion object {
        /** Captures the first `{...}` object following `fileDown.init` in the page source. */
        private val FILE_DOWN_INFO: Pattern = Pattern.compile(
            "fileDown\\.init.*?(\\{.*?\\})",
            Pattern.CASE_INSENSITIVE or Pattern.DOTALL
        )
    }
}




